{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from pandas import Series, DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Plate ID</th>\n",
       "      <th>Registration State</th>\n",
       "      <th>Vehicle Make</th>\n",
       "      <th>Violation Time</th>\n",
       "      <th>Street Name</th>\n",
       "      <th>Vehicle Color</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>J58JKX</td>\n",
       "      <td>NJ</td>\n",
       "      <td>HONDA</td>\n",
       "      <td>0523P</td>\n",
       "      <td>43 ST</td>\n",
       "      <td>BK</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>KRE6058</td>\n",
       "      <td>PA</td>\n",
       "      <td>ME/BE</td>\n",
       "      <td>0428P</td>\n",
       "      <td>UNION ST</td>\n",
       "      <td>BLK</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>444326R</td>\n",
       "      <td>NJ</td>\n",
       "      <td>LEXUS</td>\n",
       "      <td>0625A</td>\n",
       "      <td>CLERMONT AVENUE</td>\n",
       "      <td>BLACK</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>F728330</td>\n",
       "      <td>OH</td>\n",
       "      <td>CHEVR</td>\n",
       "      <td>1106A</td>\n",
       "      <td>DIVISION AVE</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>FMY9090</td>\n",
       "      <td>NY</td>\n",
       "      <td>JEEP</td>\n",
       "      <td>1253A</td>\n",
       "      <td>GRAND ST</td>\n",
       "      <td>GREY</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  Plate ID Registration State Vehicle Make Violation Time      Street Name  \\\n",
       "0   J58JKX                 NJ        HONDA          0523P            43 ST   \n",
       "1  KRE6058                 PA        ME/BE          0428P         UNION ST   \n",
       "2  444326R                 NJ        LEXUS          0625A  CLERMONT AVENUE   \n",
       "3  F728330                 OH        CHEVR          1106A     DIVISION AVE   \n",
       "4  FMY9090                 NY         JEEP          1253A         GRAND ST   \n",
       "\n",
       "  Vehicle Color  \n",
       "0            BK  \n",
       "1           BLK  \n",
       "2         BLACK  \n",
       "3           NaN  \n",
       "4          GREY  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "filename = '../data/nyc-parking-violations-2020.csv'\n",
    "\n",
    "df = pd.read_csv(filename,\n",
    "                 usecols=['Plate ID',  'Registration State',\n",
    "                        'Vehicle Make', 'Vehicle Color', 'Violation Time', 'Street Name'])\n",
    "\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Beyond 1\n",
    "\n",
    "So far, we have specified which columns must all be non-null. But sometimes, it's OK for some number of columns to have null values, so long as it's not too many. How many rows would we eliminate if we require at least three non-null values from the four columns `Plate ID`, `Registration State`, `Vehicle Make`, and `Street Name`?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "253"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "at_least_three_df = df.dropna(subset=['Plate ID', 'Registration State', 'Vehicle Make', 'Street Name'],\n",
    "                           thresh=3)\n",
    "df.shape[0] - at_least_three_df.shape[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Beyond 2\n",
    "\n",
    "Which of the columns that we've imported has the greatest number of `NaN` values?  Is this a problem?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Plate ID                 202\n",
       "Registration State         0\n",
       "Vehicle Make           62420\n",
       "Violation Time           278\n",
       "Street Name             1417\n",
       "Vehicle Color         391982\n",
       "dtype: int64"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Vehicle color is most often empty. Is that a problem? Maybe not, assuming we almost always have a plate ID\n",
    "df.isnull().sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Beyond 3\n",
    "\n",
    "Null data is bad, but there is plenty of non-null bad data, too. For example, many cars with `BLANKPLATE` as a plate ID were ticketed. Turn these into `NaN` values, and then re-run the previous query."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "944"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# use df.replace to change Plate ID of BLANKPLATE to NaN, then drop any rows without at least\n",
    "# three of our four columns non-null\n",
    "no_blankplate_df = df.replace({'Plate ID':'BLANKPLATE'}, np.NaN).dropna(subset=['Plate ID', \n",
    "                                      'Registration State', 'Vehicle Make', 'Street Name'],  \n",
    "                                thresh=3)\n",
    "df.shape[0] - no_blankplate_df.shape[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
